import os
import gzip
from numpy import median
from Bio import SeqIO


# Directory that is scanned for the gzip-compressed FASTQ files.
directory = "/osc-fs_home/mdehoon/Data/CASPARs/MiSeq/Fastq"
# The pairing loop below walks this list two entries at a time, so it must be
# sorted for each READ1 file to be directly followed by its READ2 file.
filenames = sorted(os.listdir(directory))

def _count_fastq_records(path):
    """Return the number of FASTQ records in the gzip-compressed file `path`.

    A progress line is printed before reading. The file is opened in text
    mode and is closed when counting finishes or when parsing raises.
    """
    print("Reading", path)
    with gzip.open(path, 'rt') as stream:
        return sum(1 for _ in SeqIO.parse(stream, "fastq"))


# zip() below stops at the shorter slice, so with an odd number of files the
# last one would be dropped without notice; fail loudly instead.
assert len(filenames) % 2 == 0, "odd number of files in %s" % directory

# Maps (sample, replicate) to the number of records in the READ1 file.
counts = {}
for filename1, filename2 in zip(filenames[::2], filenames[1::2]):
    # In the sorted listing, a READ1 file must be followed by its READ2 file.
    assert filename1.replace("READ1", "READ2") == filename2
    # File names must have the form <sample>_<replicate>_READ1.fq.gz.
    basename, extension = os.path.splitext(filename1)
    assert extension == ".gz"
    library, extension = os.path.splitext(basename)
    assert extension == ".fq"
    sample, replicate, read1 = library.split("_")
    assert replicate in ("r1", "r2", "r3")
    assert read1 == "READ1"
    n1 = _count_fastq_records(os.path.join(directory, filename1))
    n2 = _count_fastq_records(os.path.join(directory, filename2))
    # The READ2 count used to be computed and then discarded. Use it to
    # cross-check the pair.
    # NOTE(review): this assumes READ1/READ2 are mate files that always hold
    # the same number of records -- confirm for this data set.
    assert n1 == n2, "%s has %d reads but %s has %d reads" % (
        filename1, n1, filename2, n2)
    condition = (sample, replicate)
    # Each (sample, replicate) combination may occur only once.
    assert condition not in counts
    counts[condition] = n1

# (sample, replicate) pairs, one sample per row, listed in the same order as
# in Supplementary Table S1:
conditions = (
    ("t00", "r1"), ("t00", "r2"), ("t00", "r3"),
    ("t01", "r1"), ("t01", "r2"), ("t01", "r3"),
    ("t04", "r1"), ("t04", "r2"), ("t04", "r3"),
    ("t12", "r1"), ("t12", "r2"), ("t12", "r3"),
    ("t24", "r1"), ("t24", "r2"), ("t24", "r3"),
    ("t96", "r1"), ("t96", "r2"), ("t96", "r3"),
    ("c91", "r1"), ("c91", "r2"), ("c91", "r3"),
    ("n91", "r1"), ("n91", "r2"), ("n91", "r3"),
    # These three samples have a single replicate each.
    ("lip", "r1"),
    ("cel", "r1"),
    ("neg", "r1"),
    ("myb", "r1"), ("myb", "r2"), ("myb", "r3"),
    ("gfi", "r1"), ("gfi", "r2"), ("gfi", "r3"),
    ("nkd", "r1"), ("nkd", "r2"), ("nkd", "r3"),
)

# Print one tab-separated line (sample, replicate, read count) per library,
# in the order given by `conditions`.
for sample, replicate in conditions:
    count = counts[sample, replicate]
    print("%s\t%s\t%d" % (sample, replicate, count))

# The median is taken over every entry in `counts`, not only over the
# libraries listed in `conditions`.
all_counts = list(counts.values())
m = median(all_counts)
print("Median: %s reads" % m)

# Report the read count of the ('neg', 'r1') library as a percentage of the
# median read count.
negative_control = ('neg', 'r1')
count = counts[negative_control]
percentage = 100 * count / m
print("Protocol negative control: %d reads (%.1f%% of median)" % (count, percentage))
